path <- "/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/popgen_analyses/clustering_PCA/admixture/MaciRef_vcfAllfRef"
dt <- read.delim("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/messor_contig_AllGeneticData.txt") %>%
    rename(Ind = sra_access_number) %>%
    mutate(lineage = case_when(mitochondrial %in% "" ~ species, TRUE ~ mitochondrial))

### not very efficient function, need to be sure of each maxK per analysis
### clusType either Major or all
admixturePlot <- function(path = path, analysis = analysis, K = K, clusType = clusType,
    labelSize = labelSize) {

    samples <- read.table(list.files(paste0(path, "/", analysis), pattern = "_inds.txt",
        recursive = F, full.names = T))$V1
    lst <- list.files(paste0(path, "/", analysis, "/CLUMPAK"), pattern = "ClumppIndFile.output",
        recursive = T, full.names = T)
    lst <- lst[str_detect(lst, "Cluster") & !str_detect(lst, "K=1")]

    dtset <- dt %>%
        filter(Ind %in% samples) %>%
        arrange(Ind %in% samples) %>%
        mutate(order = paste0("V", 1:nrow(.)), ID = paste0(substring(caste, 1, 1),
            "_", Ind)) %>%
        rename(pop = lineage) %>%
        select(ID, pop, order)

    dtCluster <- data.frame()
    for (i in lst) {
        outfile <- read.table(i, header = FALSE) %>%
            select(-(V1:V5))

        setLabel <- gsub("/", "_", gsub("/CLUMPP.files/ClumppIndFile.output", "",
            unlist(str_split(i, pattern = "CLUMPAK/"))[2]))

        colnames(outfile) <- paste0("K", 1:ncol(outfile))

        outfile <- outfile %>%
            mutate(order = paste0("V", 1:nrow(.)), set = setLabel) %>%
            left_join(dtset, by = "order") %>%
            pivot_longer(-c(order, ID, pop, set), names_to = "cluster") %>%
            arrange(pop)

        dtCluster <- bind_rows(dtCluster, outfile)
    }

    clusType <- ifelse(clusType %in% "Major", "Major", "Major|Minor")
    maxK <- max(str_remove(dtCluster$cluster, "K"))
    K <- ifelse(K != maxK, K, maxK)

    dtCluster1 <- dtCluster %>%
        separate(set, into = c("kSet", "clusterType"), sep = "_", remove = F) %>%
        filter(str_detect(kSet, paste(seq(1, K), collapse = "|")), str_detect(clusterType,
            clusType))

    p1 = ggplot(data = dtCluster1, aes(x = ID, y = value, fill = cluster)) + geom_bar(stat = "identity",
        show.legend = FALSE, colour = "black", width = 1, size = 0.1) + facet_grid(set ~
        pop, scales = "free", space = "free") + theme_bw() + scale_fill_iwanthue() +
        theme(plot.background = element_blank(), panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), panel.border = element_blank(), panel.spacing = unit(0.1,
                "lines"), strip.background = element_rect(fill = "gray95"), strip.text.y = element_text(size = labelSize,
                angle = 0), strip.text.x = element_text(size = labelSize), axis.title.y = element_blank(),
            axis.title.x = element_blank(), axis.text.y = element_text(size = labelSize),
            axis.text.x = element_text(angle = 30, vjust = 1.2, hjust = 1, size = labelSize),
            legend.position = "none", strip.placement = "outside") + scale_y_continuous(expand = c(0,
        0))
    return(p1)
}

cvPlot <- function(path = path, analysis = analysis) {

    dt <- list.files(paste0(path, "/", analysis, "/stdout"), pattern = "stdout",
        recursive = T, full.names = T)

    cvs <- data.frame()
    for (cv in dt) {
        cvd <- read.delim(cv)
        cvs <- bind_rows(cvs, data.frame(run = word(cv, -2, sep = fixed(".")), cvError = cvd[str_detect(cvd[,
            1], "CV error"), ]))
    }

    cvs <- cvs %>%
        mutate(cvError = word(cvError, sep = ": ", 2, 2)) %>%
        separate(run, into = c("K", "rep"), sep = "_")

    clst <- list.files(paste0(path, "/", analysis, "/CLUMPAK"), pattern = "MCL.clusters|FilesToIndex",
        recursive = T, full.names = T)
    clst <- clst[!str_detect(clst, "Cluster")]

    tpp <- data.frame()
    for (cls in 1:(nrow(cvs)/20)) {
        lss <- clst[str_detect(clst, paste0("K=", cls, "/"))]

        mcl <- read.table(lss[str_detect(lss, "MCL")], sep = "\t")
        clus <- str_split(mcl$V1, " ")

        if (length(clus) > 1) {
            names(clus) <- c("majorCluster", paste0("minorCluster", seq(1, (length(clus) -
                1))))
        } else {
            names(clus) <- "majorCluster"
        }

        fti <- read.table(lss[str_detect(lss, "FilesToIndex")]) %>%
            mutate(V2 = word(V2, -3, sep = fixed("."))) %>%
            separate(col = V2, sep = "_", into = c("K", "rep"))
        fti$cluster <- NA
        for (j in names(clus)) {
            fti[fti$V1 %in% clus[[j]], "cluster"] <- j
        }
        tpp <- bind_rows(tpp, fti)
    }

    cvs <- left_join(cvs, tpp, by = c("K", "rep"))

    ggplot(cvs, aes(x = fct_inseq(K), y = as.numeric(cvError), color = cluster)) +
        geom_boxplot(position = position_dodge(0.9)) + geom_point(position = position_dodge(0.9)) +
        geom_text_repel(aes(label = rep), position = position_dodge(0.9)) + theme_bw() +
        labs(x = "K", y = "CV value", title = "Variability in cross validation") +
        theme(legend.position = "bottom")
}

Admixture analyses were run with per analysis VCF files obtained from merging per sample VCF files. This was done to decrease time of bcftools mpileup per dataset, but to improve overlap of sites across RNAseq and WGS samples a .bed file was created with a set of SNPs including a little over 1 million sites.

The .bed file was it self created from the cleaning of a qsl2 vcf (subset of RNAseq queen samples representing the different mitochondrial lineages). The filtering of this vcf file retrieved biallelic sites (indels excluded) with Qual above 20, read depth above 10 and allowing 10% missing data across sites (script subMakeBed.sh).

Each sample VCF was called with keeping all sites, filtering (’FORMAT/DP>10 && QUAL>20) and then subsetting SNP sites from created .bed file. The final merged VCF file was then filtered for biallelic sites thinned every 500bp and allowing 25% missing data per site.

Admixture analyses were run aided by the admixturePipeline.py (https://github.com/stevemussmann/admixturePipeline) and evaluated with CLUMPAK.

Analyses were run for a dataset with all queens and seven subclades datasets the include queens and workers. These datasets focus on the three social hybridogenesis groups of species: Mbar only and Mbar with close relatives; Mebe only and Mebe with close relatives; and M. structor as well as M. structor lineages split between the two main clades: Ibericus + ponticus + mcarthuri (Mstr127) and muticus + structor3 + structor4 (Mstr346).


Messor Queens

analysis <- "MessorQueens"
K = 6
clusType = "Major"
labelSize = 6

samples <- read.table(list.files(paste0(path, "/", analysis), pattern = "_inds.txt",
    recursive = F, full.names = T))$V1
lst <- list.files(paste0(path, "/", analysis, "/CLUMPAK"), pattern = "ClumppIndFile.output",
    recursive = T, full.names = T)
lst <- lst[str_detect(lst, "Cluster") & !str_detect(lst, "K=1")]

dtset <- dt %>%
    filter(Ind %in% samples) %>%
    arrange(Ind %in% samples) %>%
    mutate(order = paste0("V", 1:nrow(.)), ID = paste0(substring(caste, 1, 1), "_",
        Ind)) %>%
    rename(pop = species) %>%
    select(ID, pop, order)

dtCluster <- data.frame()
for (i in lst) {
    outfile <- read.table(i, header = FALSE) %>%
        select(-(V1:V5))

    setLabel <- gsub("/", "_", gsub("/CLUMPP.files/ClumppIndFile.output", "", unlist(str_split(i,
        pattern = "CLUMPAK/"))[2]))

    colnames(outfile) <- paste0("K", 1:ncol(outfile))

    outfile <- outfile %>%
        mutate(order = paste0("V", 1:nrow(.)), set = setLabel) %>%
        left_join(dtset, by = "order") %>%
        pivot_longer(-c(order, ID, pop, set), names_to = "cluster") %>%
        arrange(pop)

    dtCluster <- bind_rows(dtCluster, outfile)
}

clusType <- ifelse(clusType %in% "Major", "Major", "Major|Minor")
maxK <- max(str_remove(dtCluster$cluster, "K"))
K <- ifelse(K != maxK, K, maxK)

dtCluster1 <- dtCluster %>%
    separate(set, into = c("kSet", "clusterType"), sep = "_", remove = F) %>%
    filter(str_detect(kSet, paste(seq(1, K), collapse = "|")), str_detect(clusterType,
        clusType))

p1 = ggplot(data = dtCluster1, aes(x = ID, y = value, fill = cluster)) + geom_bar(stat = "identity",
    show.legend = FALSE, colour = "black", width = 1, size = 0.1) + facet_grid(set ~
    pop, scales = "free", space = "free") + theme_bw() + scale_fill_iwanthue() +
    theme(plot.background = element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), panel.border = element_blank(), panel.spacing = unit(0.1,
            "lines"), strip.text.y = element_text(size = labelSize, angle = 0), strip.text.x = element_text(size = labelSize,
            angle = 90), axis.text.x = element_text(angle = 30, vjust = 1.2, hjust = 1,
            size = labelSize), axis.title.x = element_blank(), axis.ticks.x = element_blank(),
        axis.title.y = element_blank(), axis.text.y = element_text(size = labelSize),
        legend.position = "none", strip.placement = "outside")
p1

locusID <- read.delim(paste0(path, "/messorQueens/MessorQueens_filteredThin.recode.map"),
    header = F)

Using 33151 SNPs.

cvPlot(path, analysis)


M. barbarus system

M.barbarus + Decipiens + Capitatus

analysis <- "MbarDecCap"

admixturePlot(path, analysis, K = 4, "all", 8)

locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
    header = F)

Using 25913 SNPs.

cvPlot(path, analysis)


M.barbarus

analysis <- "Mbar"

admixturePlot(path, analysis, K = 4, "all", 8)

locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
    header = F)

Using 22365 SNPs.

cvPlot(path, analysis)


M.barbarus queens

analysis <- "MbarQueens"

admixturePlot(path, analysis, K = 2, "all", 8)

locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
    header = F)

Using 26072 SNPs.

cvPlot(path, analysis)

The analysis with just queens confirms structuring within M. barbarus even though this is not very clear in the analysis including the workers. In the the analysis with workers sample SRR4292909 is not more closely related to the other queens from the same mtDNA clade but to two others workers that occur in different geographic locations.

M.barbarus with CRAM file

analysis <- "MbarCRAM"

admixturePlot(path, analysis, K = 4, "all", 8)

locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_cramfilteredThin.recode.map"),
    header = F)

Using 107880 SNPs.

cvPlot(path, analysis)


M.ebeninus system

M.ebeninus + Wasmani + Minor

analysis <- "MebeWesMin"

admixturePlot(path, analysis, K = 4, "all", 8)

locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
    header = F)

Using 16416 SNPs.

cvPlot(path, analysis)


M.ebeninus

analysis <- "Mebe"

admixturePlot(path, analysis, K = 4, "all", 8)

locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
    header = F)

Using 14540 SNPs.

cvPlot(path, analysis)


M.ebeninus queens

analysis <- "MebeQueens"

admixturePlot(path, analysis, K = 3, "all", 8)

locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
    header = F)

Using 12753 SNPs.

cvPlot(path, analysis)

The analysis with just queens supports the lack of structure for M. ebeninus lineages, i.e. M. ebeninus samples represent a panmitic population.


M.structor system

All M.structor

analysis <- "Mstr"

admixturePlot(path, analysis, K = 8, "Major", 7)

locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
    header = F)

Using 28361 SNPs.

cvPlot(path, analysis)


Mstr127 - Ibericus + Ponticus + Mcarthuri

analysis <- "Mstr127"

admixturePlot(path, analysis, K = 5, "all", 7)

locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
    header = F)

Using 26095 SNPs.

cvPlot(path, analysis)


Mstr346 - Structor3 + Structor4 + Muticus

analysis <- "Mstr346"

admixturePlot(path, analysis, K = 5, "all", 8)

locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
    header = F)

Using 41311 SNPs.

cvPlot(path, analysis)


M.structor queens

analysis <- "MstrQueens"

admixturePlot(path, analysis, K = 7, "all", 7)

locusID <- read.delim(paste0(path, "/", analysis, "/", analysis, "_filteredThin.recode.map"),
    header = F)

Using 31758 SNPs.

cvPlot(path, analysis)